#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# QQ: 34538980@qq.com
# Jekkay Hu, 2013.5.5
# ///////////////////////////////////////////////////////////////////
#                            _ooOoo_                               //
#                           o8888888o                              //
#                           88" . "88                              //
#                           (| ^_^ |)                              //
#                           O\  =  /O                              //
#                        ____/`---'\____                           //
#                      .'  \\|     |//  `.                         //
#                     /  \\|||  :  |||//  \                        //
#                    /  _||||| -:- |||||-  \                       //
#                    |   | \\\  -  /// |   |                       //
#                    | \_|  ''\---/''  |   |                       //
#                    \  .-\__  `-`  ___/-. /                       //
#                  ___`. .'  /--.--\  `. . ___                     //
#                ."" '<  `.___\_<|>_/___.'  >'"".                  //
#              | | :  `- \`.;`\ _ /`;.`/ - ` : | |                 //
#              \  \ `-.   \_ __\ /__ _/   .-` /  /                 //
#        ========`-.____`-.___\_____/___.-`____.-'========         //
#                             `=---='                              //
#        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^        //
#             佛祖保佑       永无BUG        运行正常                   //
#     -------------------------------------------------------      //
#               QQ: 34538980@qq.com                                //
#               博客: http://www.easysb.cn                          //
#               Jekkay Hu, 2013.5.5                                //
# ///////////////////////////////////////////////////////////////////
#
# 从搜索引擎搜索指定的关键词，采集初始域名
#

import time
import logging
from conf.context import context
from conf.const import SCAN_STATE_INIT
from utility.function import load_dict_from_file, get_url, extract_all_links, is_legal_domain
from scan.base_db_op_thread import BaseDBOpThread
from scan.links_processor import LinksProcessor

# Base SQL statements used by this thread.
SQL_CHECK_AVAILABLE = 'SELECT COUNT(1) AS c FROM `domain` WHERE SCAN = {}'.format(SCAN_STATE_INIT)
SQL_QUERY_DOMAIN = 'SELECT * FROM `domain` WHERE `domain`=%(domain)s'
SQL_INSERT_DOMAIN = 'INSERT IGNORE INTO `domain`(`id`, `domain`,`is_http`, `is_https`,`scan`,`create_time`) VALUES (%(id)s,%(domain)s,%(is_http)s,%(is_https)s,%(scan)s, %(create_time)s)'
# Search-result URL templates; `keyword` and `offset` are filled in via str.format().
BAIDU_SEARCH_URL = 'https://www.baidu.com/s?wd={keyword}&pn={offset}&oq={keyword}&ie=utf-8&usm=2&rsv_pq=8ef1cae500352aa0&rsv_t=9528fYOLehqCGYxajyvaGO5oiUV%2B%2FKXmcCZKmpnhXyxJ2MJQ2vl7J6RZals'
GOOGLE_SEARCH_URL = 'https://www.google.com/search?q={keyword}&safe=active&start={offset}'
# Seconds to sleep between work-loop iterations.
_CHECK_INTERVAL = 10


class SearchKeywordsThread(BaseDBOpThread):
    """Collect seed domains by querying search engines with keywords.

    Runs as a DB-op thread: whenever the number of un-scanned domains in
    the `domain` table drops below ``self.threshold``, it reloads the
    keyword list from ``resources/keywords.txt``, queries the configured
    search engine (Baidu or Google) for each keyword, extracts domains
    from the result page and inserts previously unseen ones with state
    ``SCAN_STATE_INIT``.
    """

    def __init__(self):
        super(SearchKeywordsThread, self).__init__()
        # Start collecting when fewer than this many un-scanned domains remain.
        self.threshold = 1000
        # Keywords loaded from resources/keywords.txt.
        self.keywords = []
        # Running total of new domains inserted by this thread.
        self.total = 0
        # Current result page, 0-based (0 is the first page).
        self.page = 0
        # Recently seen "domain_is_http_is_https" keys; avoids hitting the
        # database again for duplicates within a session.
        self.cache = []

    def work_loop(self):
        """One iteration of the thread loop: search when the backlog is low, then sleep."""
        if 0 <= self._check_available() < self.threshold:
            # Reload the keyword list in case the file changed on disk.
            self._load_keywords_list()
            count = self._search_domain_by_keywords()
            # BUGFIX: the running total was never updated before, so the
            # "total" in this log line was always 0.
            self.total += count
            logging.info("find domain count %s, total %s, cur page %s" % (count, self.total, self.page))
            # Cycle through at most 100 result pages.
            self.page = (self.page + 1) % 100
        # Drop the DB connection while idle.
        self._close_db_connect()
        # Bound the de-duplication cache: keep only the most recent entries.
        if len(self.cache) > 2000:
            self.cache = self.cache[1000:]
        # NOTE: `global` is unnecessary here — _CHECK_INTERVAL is only read.
        logging.info('search thread sleep %s seconds, (。-ω-)zzz' % _CHECK_INTERVAL)
        time.sleep(_CHECK_INTERVAL)

    def _check_available(self):
        """Return the count of domains still waiting to be scanned.

        Returns:
            int: the pending-domain count, or -1 when the database
            connection is unavailable.
        """
        if (not self.db) or (not self.db.is_connected()):
            logging.error("fail to connect to db")
            return -1
        result = self.db.query(SQL_CHECK_AVAILABLE)
        c = result[0]['c'] if result and len(result) > 0 and result[0]['c'] else 0
        logging.info('current domain ready count %s' % c)
        return c

    def _load_keywords_list(self):
        """(Re)load the search keywords from resources/keywords.txt."""
        self.keywords = load_dict_from_file('%s/resources/keywords.txt' % context.main_folder)

    def _search_domain_by_keywords(self):
        """Search every keyword on the current page.

        Returns:
            int: number of new domains inserted across all keywords.
        """
        if not self.keywords:
            # Fall back to a generic keyword when the list is empty.
            self.keywords = ['cms']
        c = 0
        for keyword in self.keywords:
            logging.info('start to search, page: %s, keywords: %s' % (self.page + 1, keyword))
            pattern = self._get_search_engine()
            url = pattern.format(keyword=keyword, offset=self.page * 10)
            c += self._search_domain(url)
            logging.info('search url: %s, keyword: %s, new domain count %s' % (url, keyword, c))
            # Be polite to the search engine between queries.
            time.sleep(2)
        return c

    def _get_search_engine(self):
        """Return the URL template of the configured engine (Baidu by default)."""
        if not context.setting['search'] or context.setting['search']['baidu']:
            return BAIDU_SEARCH_URL
        return GOOGLE_SEARCH_URL

    def _search_domain(self, url, follow=True):
        """Fetch *url*, extract domains and store the unseen ones.

        Args:
            url: fully formatted search-result URL.
            follow: passed through to LinksProcessor.extract_domain_list.

        Returns:
            int: number of domains newly inserted into the database.
        """
        links_count = 0
        try:
            content = get_url(url)
            if not content:
                return links_count
            domain_list = LinksProcessor.extract_domain_list(content, follow)
            for d in domain_list:
                key = '%s_%s_%s' % (d.domain, d.is_http, d.is_https)
                # Already seen in this session — no need to touch the DB.
                if key in self.cache:
                    continue
                self.cache.append(key)
                if not self._is_exist_db(d):
                    logging.info('[search engine] find new domain %s' % d.domain)
                    self._save_to_db(d)
                    links_count += 1
        except Exception as e:
            # Network/parsing failures are best-effort: log (was a bare
            # print) instead of crashing the thread.
            logging.error('_search_domain exception: %s' % e)
        return links_count

    def _is_exist_db(self, d):
        """Return True if the domain already exists in the `domain` table."""
        try:
            result = self.db.query(SQL_QUERY_DOMAIN, {'domain': d.domain})
            return bool(result)
        except Exception as e:
            logging.error('_is_exist_db exception: %s' % e)
            # On query failure fall through to the insert path; the
            # INSERT IGNORE statement makes a duplicate insert harmless.
            return False

    def _save_to_db(self, d):
        """Insert a new domain row with the initial scan state; errors are logged only."""
        try:
            self.db.execute(SQL_INSERT_DOMAIN, {
                'id': d.id,
                'domain': d.domain,
                'is_http': d.is_http,
                'is_https': d.is_https,
                'scan': SCAN_STATE_INIT,
                'create_time': d.create_time
            })
            logging.info('find new domain %s' % d.domain)
        except Exception as e:
            logging.error('_save_to_db exception: ' + str(e))
